In [103]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'notebook'
In [104]:
df=pd.read_csv('Cleaned_Clothing.csv')
In [105]:
df.head()
Out[105]:
Customer ID Customer Name Product Product Category Size Color Cost (in NPR) Quantity Total Cost (in NPR) Purchase Date Store Location Purchase Method Foot Traffic Gender Product Rating Customer Type
0 CUST001 Customer_1 Dress Casualwear L Black 1800 4 7200 2024-04-30 Bhaktapur Visited Store 132 Male 2 New
1 CUST002 Customer_2 Shirt Casualwear L Blue 1737 2 3474 2023-12-17 Lalitpur Website 471 Female 2 New
2 CUST003 Customer_3 Skirt Bottomwear S Black 648 3 1944 2023-11-19 Boudha Visited Store 488 Male 4 Returning
3 CUST004 Customer_4 Blazer Bottomwear XL Yellow 2603 1 2603 2024-04-30 Thamel Visited Store 309 Male 2 Returning
4 CUST005 Customer_5 T-shirt Outerwear M Blue 780 1 780 2024-09-16 Bhaktapur Visited Store 395 Male 2 Returning
In [106]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Customer ID          100 non-null    object
 1   Customer Name        100 non-null    object
 2   Product              100 non-null    object
 3   Product Category     100 non-null    object
 4   Size                 100 non-null    object
 5   Color                100 non-null    object
 6   Cost (in NPR)        100 non-null    int64 
 7   Quantity             100 non-null    int64 
 8   Total Cost (in NPR)  100 non-null    int64 
 9   Purchase Date        100 non-null    object
 10  Store Location       100 non-null    object
 11  Purchase Method      100 non-null    object
 12  Foot Traffic         100 non-null    int64 
 13  Gender               100 non-null    object
 14  Product Rating       100 non-null    int64 
 15  Customer Type        100 non-null    object
dtypes: int64(5), object(11)
memory usage: 12.6+ KB
In [107]:
# Sorting
df = df.sort_values("Customer Name")
df.head()
Out[107]:
Customer ID Customer Name Product Product Category Size Color Cost (in NPR) Quantity Total Cost (in NPR) Purchase Date Store Location Purchase Method Foot Traffic Gender Product Rating Customer Type
0 CUST001 Customer_1 Dress Casualwear L Black 1800 4 7200 2024-04-30 Bhaktapur Visited Store 132 Male 2 New
9 CUST010 Customer_10 Shorts Bottomwear S Red 623 1 623 2024-05-13 Kathmandu Visited Store 144 Female 5 Returning
99 CUST100 Customer_100 T-shirt Topwear M Red 1287 5 6435 2024-01-13 Bhaktapur Visited Store 150 Male 5 New
10 CUST011 Customer_11 Blazer Formalwear M Yellow 2142 2 4284 2024-09-29 Thamel Visited Store 418 Male 2 Returning
11 CUST012 Customer_12 T-shirt Topwear L Pink 620 1 620 2024-02-21 Thamel Visited Store 127 Male 5 New
In [108]:
df.isnull().sum()
Out[108]:
Customer ID            0
Customer Name          0
Product                0
Product Category       0
Size                   0
Color                  0
Cost (in NPR)          0
Quantity               0
Total Cost (in NPR)    0
Purchase Date          0
Store Location         0
Purchase Method        0
Foot Traffic           0
Gender                 0
Product Rating         0
Customer Type          0
dtype: int64
In [109]:
# Dropping Unnecessary Columns
df = df.drop(["Customer ID", "Foot Traffic"], axis=1)
df.head()
Out[109]:
Customer Name Product Product Category Size Color Cost (in NPR) Quantity Total Cost (in NPR) Purchase Date Store Location Purchase Method Gender Product Rating Customer Type
0 Customer_1 Dress Casualwear L Black 1800 4 7200 2024-04-30 Bhaktapur Visited Store Male 2 New
9 Customer_10 Shorts Bottomwear S Red 623 1 623 2024-05-13 Kathmandu Visited Store Female 5 Returning
99 Customer_100 T-shirt Topwear M Red 1287 5 6435 2024-01-13 Bhaktapur Visited Store Male 5 New
10 Customer_11 Blazer Formalwear M Yellow 2142 2 4284 2024-09-29 Thamel Visited Store Male 2 Returning
11 Customer_12 T-shirt Topwear L Pink 620 1 620 2024-02-21 Thamel Visited Store Male 5 New
In [110]:
# Rename the column in the DataFrame
df.rename(columns={'Cost (in NPR)': 'Cost'}, inplace=True)
df.rename(columns={'Total Cost (in NPR)': 'Total_Cost'}, inplace=True)
In [111]:
#mapping visited store  as 1 and website for 0
purchase_method_mapping= {"Visited Store": 1, "Website": 0}
df["Purchase Method"] = df["Purchase Method"].map(purchase_method_mapping)
In [112]:
df.head()
Out[112]:
Customer Name Product Product Category Size Color Cost Quantity Total_Cost Purchase Date Store Location Purchase Method Gender Product Rating Customer Type
0 Customer_1 Dress Casualwear L Black 1800 4 7200 2024-04-30 Bhaktapur 1 Male 2 New
9 Customer_10 Shorts Bottomwear S Red 623 1 623 2024-05-13 Kathmandu 1 Female 5 Returning
99 Customer_100 T-shirt Topwear M Red 1287 5 6435 2024-01-13 Bhaktapur 1 Male 5 New
10 Customer_11 Blazer Formalwear M Yellow 2142 2 4284 2024-09-29 Thamel 1 Male 2 Returning
11 Customer_12 T-shirt Topwear L Pink 620 1 620 2024-02-21 Thamel 1 Male 5 New
In [113]:
#mapping customer type as 1 for new and 0 for existing
customer_type_mapping= {
    "New": 1, 
    "Returning": 0
    }
df["Customer Type"] = df["Customer Type"].map(customer_type_mapping)
df.head()
Out[113]:
Customer Name Product Product Category Size Color Cost Quantity Total_Cost Purchase Date Store Location Purchase Method Gender Product Rating Customer Type
0 Customer_1 Dress Casualwear L Black 1800 4 7200 2024-04-30 Bhaktapur 1 Male 2 1
9 Customer_10 Shorts Bottomwear S Red 623 1 623 2024-05-13 Kathmandu 1 Female 5 0
99 Customer_100 T-shirt Topwear M Red 1287 5 6435 2024-01-13 Bhaktapur 1 Male 5 1
10 Customer_11 Blazer Formalwear M Yellow 2142 2 4284 2024-09-29 Thamel 1 Male 2 0
11 Customer_12 T-shirt Topwear L Pink 620 1 620 2024-02-21 Thamel 1 Male 5 1
In [114]:
#mapping male as 1 and female as 0
gender_mapping= {
    'Male': 1,
    'Female': 0
    }
df["Gender"] = df["Gender"].map(gender_mapping)
In [115]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 0 to 98
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Customer Name     100 non-null    object
 1   Product           100 non-null    object
 2   Product Category  100 non-null    object
 3   Size              100 non-null    object
 4   Color             100 non-null    object
 5   Cost              100 non-null    int64 
 6   Quantity          100 non-null    int64 
 7   Total_Cost        100 non-null    int64 
 8   Purchase Date     100 non-null    object
 9   Store Location    100 non-null    object
 10  Purchase Method   100 non-null    int64 
 11  Gender            100 non-null    int64 
 12  Product Rating    100 non-null    int64 
 13  Customer Type     100 non-null    int64 
dtypes: int64(7), object(7)
memory usage: 11.7+ KB
In [116]:
#mapping the size
size_mapping= {
    'S': 1,
    'M': 2,
    'L': 3,
    'XL': 4
    }
df["Size"] = df["Size"].map(size_mapping)
In [117]:
# Ratings Category (Low, Medium, High)
def categorize_rating(rating):
    if rating <= 2.5:
        return 'Low'
    elif rating <= 3.5:
        return 'Medium'
    else:
        return 'High'

df['Rating_Categoy'] = df['Product Rating'].apply(categorize_rating)
df.head()
Out[117]:
Customer Name Product Product Category Size Color Cost Quantity Total_Cost Purchase Date Store Location Purchase Method Gender Product Rating Customer Type Rating_Categoy
0 Customer_1 Dress Casualwear 3 Black 1800 4 7200 2024-04-30 Bhaktapur 1 1 2 1 Low
9 Customer_10 Shorts Bottomwear 1 Red 623 1 623 2024-05-13 Kathmandu 1 0 5 0 High
99 Customer_100 T-shirt Topwear 2 Red 1287 5 6435 2024-01-13 Bhaktapur 1 1 5 1 High
10 Customer_11 Blazer Formalwear 2 Yellow 2142 2 4284 2024-09-29 Thamel 1 1 2 0 Low
11 Customer_12 T-shirt Topwear 3 Pink 620 1 620 2024-02-21 Thamel 1 1 5 1 High
In [118]:
df['Product Category'].unique()
Out[118]:
array(['Casualwear', 'Bottomwear', 'Topwear', 'Formalwear', 'Outerwear'],
      dtype=object)
In [119]:
#mapping product wear
product_category_mapping= {
    'Casualwear': 1,
    'Formalwear': 2,
    'Bottomwear': 3,
    'Outerwear': 4,
    'Topwear': 5
    }
df["Product Category"] = df["Product Category"].map(product_category_mapping)

Visualizations¶

In [120]:
#visualization based on the store location and the purchase method
import plotly.express as px

df['store'] = df['Store Location'].astype('category')
fig = px.scatter(df, x='store', y='Cost', color='Purchase Method', 
                 title='Scatter Plot of Cost by Store Location and Purchase Method')
fig.show()
In [121]:
# Group by Gender and calculate the average Cost
grouped_df = df.groupby('Gender')['Cost'].mean().reset_index()

# Create a bar plot with specific colors for male and female
fig = px.bar(grouped_df, x='Gender', y='Cost', title='Average Cost by Gender',
             color='Gender', color_discrete_map={'Male': 'blue', 'Female': 'pink'})
fig.show()
In [122]:
# Group by Store Location and sum the Total Cost
grouped_df = df.groupby('Store Location')['Total_Cost'].sum().reset_index()

# Create a bar plot
fig = px.bar(grouped_df, x='Store Location', y='Total_Cost', title='Total Cost by Store Location')
fig.show()
In [123]:
# Group by Purchase Method and sum the Quantity
grouped_df = df.groupby('Purchase Method')['Quantity'].sum().reset_index()
fig = px.pie(grouped_df, names='Purchase Method', values='Quantity', title='Total Quantity by Purchase Method', hole=0.3)
fig.show()
In [124]:
# Group by Product Category and Size, and sum the Total Cost
grouped_df = df.groupby(['Product Category', 'Size'])['Total_Cost'].sum().reset_index()

# Create a line plot
fig = px.line(grouped_df, x='Product Category', y='Total_Cost', color='Size', 
              title='Total Cost by Product Category and Size', markers=True)
fig.show()
In [125]:
# Group by Product Category and calculate the average Product Rating
grouped_df = df.groupby('Product Category')['Product Rating'].mean().reset_index()
fig = px.scatter(grouped_df, x='Product Category', y='Product Rating', title='Average Product Rating by Product Category',
                 size='Product Rating', color='Product Category')
fig.show()
In [126]:
# Group by Store Location and sum the Total Cost
grouped_df = df.groupby('Store Location')['Total_Cost'].sum().reset_index()
fig = px.bar(grouped_df, x='Store Location', y='Total_Cost', title='Total Cost by Store Location',color='Store Location')
fig.show()
In [127]:
# Group by Gender and Purchase Method, and count the occurrences
grouped_df = df.groupby(['Gender', 'Purchase Method']).size().reset_index(name='Count')
fig = px.pie(grouped_df, names='Purchase Method', values='Count', 
             title='Gender-wise Purchase Method Distribution', facet_col='Gender')
fig.show()
In [128]:
# Group by Product Category and calculate the average Product Rating
grouped_df = df.groupby('Product Category')['Product Rating'].mean().reset_index()
fig = px.bar(grouped_df, x='Product Category', y='Product Rating', title='Average Product Rating by Product Category',
             color='Product Category')
fig.show()
In [129]:
df.columns
Out[129]:
Index(['Customer Name', 'Product', 'Product Category', 'Size', 'Color', 'Cost',
       'Quantity', 'Total_Cost', 'Purchase Date', 'Store Location',
       'Purchase Method', 'Gender', 'Product Rating', 'Customer Type',
       'Rating_Categoy', 'store'],
      dtype='object')
In [130]:
df.describe()
Out[130]:
Product Category Size Cost Quantity Total_Cost Purchase Method Gender Product Rating Customer Type
count 100.00000 100.000000 100.000000 100.000000 100.00000 100.000000 100.000000 100.00000 100.000000
mean 3.15000 2.510000 2634.640000 3.230000 8592.31000 0.510000 0.440000 2.99000 0.480000
std 1.34371 1.029808 1383.482113 1.462356 6176.86699 0.502418 0.498888 1.46677 0.502117
min 1.00000 1.000000 533.000000 1.000000 620.00000 0.000000 0.000000 1.00000 0.000000
25% 2.00000 2.000000 1326.750000 2.000000 3452.25000 0.000000 0.000000 2.00000 0.000000
50% 3.00000 3.000000 2550.500000 3.000000 7446.00000 1.000000 0.000000 3.00000 0.000000
75% 4.00000 3.000000 3888.500000 5.000000 12371.25000 1.000000 1.000000 4.00000 1.000000
max 5.00000 4.000000 4979.000000 5.000000 24695.00000 1.000000 1.000000 5.00000 1.000000
In [131]:
corr = df.corr(numeric_only=1)
corr
Out[131]:
Product Category Size Cost Quantity Total_Cost Purchase Method Gender Product Rating Customer Type
Product Category 1.000000 -0.019344 0.047035 -0.063999 0.044899 -0.009725 0.036163 0.149395 0.251515
Size -0.019344 1.000000 0.197383 -0.018311 0.153153 -0.117332 -0.126617 -0.103585 -0.028911
Cost 0.047035 0.197383 1.000000 0.041151 0.722298 0.091731 -0.112076 -0.231336 0.065437
Quantity -0.063999 -0.018311 0.041151 1.000000 0.639537 -0.161267 -0.070889 0.076431 0.013206
Total_Cost 0.044899 0.153153 0.722298 0.639537 1.000000 -0.001962 -0.160343 -0.082812 0.052109
Purchase Method -0.009725 -0.117332 0.091731 -0.161267 -0.001962 1.000000 0.143465 -0.061544 -0.059259
Gender 0.036163 -0.126617 -0.112076 -0.070889 -0.160343 0.143465 1.000000 -0.021534 -0.125809
Product Rating 0.149395 -0.103585 -0.231336 0.076431 -0.082812 -0.061544 -0.021534 1.000000 -0.075707
Customer Type 0.251515 -0.028911 0.065437 0.013206 0.052109 -0.059259 -0.125809 -0.075707 1.000000
In [132]:
# Heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='magma', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()
No description has been provided for this image

model Training¶

In [133]:
Features = df[['Total_Cost','Purchase Method','Product Category','Size','Quantity']]
Target = df['Cost']
In [134]:
#splitting the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(Features, Target, test_size=0.2, random_state=0)
In [135]:
# Gradient Boosting Regressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score

gbr = GradientBoostingRegressor()
# Train the model
gbr.fit(X_train, y_train)
y_pred = gbr.predict(X_test)
In [136]:
# Model Evaluation
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Absolute Error: ", mae)
print("R2 Score: ", r2)
Mean Absolute Error:  200.90494036792384
R2 Score:  0.9729606885493439
In [137]:
# Visualization of predicted vs actual costs
fig_cost = px.scatter(x=y_test, y=y_pred, labels={'x': 'Actual Cost', 'y': 'Predicted Cost'}, title='Actual vs Predicted Cost')
fig_cost.add_shape(
    type="line", line=dict(dash='dash'),
    x0=y_test.min(), y0=y_test.min(),
    x1=y_test.max(), y1=y_test.max()
)
fig_cost.update_layout(paper_bgcolor="white")
fig_cost.show()

cost_list = [[predicted] for actual, predicted in zip(y_test, y_pred)]
In [138]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score

gbr = GradientBoostingRegressor(n_estimators=300, learning_rate=0.2, max_depth=3, random_state=42,loss='squared_error')
gbr.fit(X_train, y_train)

y_pred = gbr.predict(X_test)

cv_scores_r2 = cross_val_score(gbr, Features, Target, cv=5, scoring='r2')
print("Cross-validation R2 scores for Gradient Boosting Regressor:", cv_scores_r2)
print("Mean R2 score:", cv_scores_r2.mean())

# Perform cross-validation for Mean Absolute Error
cv_scores_mae = cross_val_score(gbr, Features, Target, cv=5, scoring='neg_mean_absolute_error')
cv_scores_mae = -cv_scores_mae  # Convert to positive values
print("Cross-validation MAE scores for Gradient Boosting Regressor:", cv_scores_mae)
print("Mean MAE score:", cv_scores_mae.mean())
Cross-validation R2 scores for Gradient Boosting Regressor: [0.97341073 0.96872318 0.9516513  0.94338487 0.95744448]
Mean R2 score: 0.9589229130407503
Cross-validation MAE scores for Gradient Boosting Regressor: [180.71856597 207.74159584 229.73779601 201.30292391 215.83694738]
Mean MAE score: 207.06756582212523

Random Forest Regressor¶

In [139]:
from sklearn.ensemble import RandomForestRegressor
# basic model
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
y_pred = rfr.predict(X_test)
In [140]:
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Absolute Error: ", mae)
print("R2 Score: ", r2)
Mean Absolute Error:  419.21450000000004
R2 Score:  0.8840462075478562
In [145]:
#plot tree diagram based model
from sklearn.tree import plot_tree
plt.figure(figsize=(20, 10))
plot_tree(rfr.estimators_[0], filled=True, feature_names=Features.columns)
plt.show()
No description has been provided for this image

Hyperparameter and cross validation

In [141]:
#hyperparameter tuning for random forest regressor
rfr_params = {
    'n_estimators': 300,
    'max_depth': 3,
    'random_state': 42
}
rfr = RandomForestRegressor(**rfr_params)

# Fit the model
rfr.fit(X_train, y_train)

# Predict on the test set
y_pred_reg = rfr.predict(X_test)

# Perform cross-validation for R2 score
cv_scores_r2 = cross_val_score(rfr, Features, Target, cv=5, scoring='r2')
print("Cross-validation R2 scores for Random Forest Regressor:", cv_scores_r2)
print("Mean R2 score:", cv_scores_r2.mean())

# Perform cross-validation for Mean Absolute Error
cv_scores_mae = cross_val_score(rfr, Features, Target, cv=5, scoring='neg_mean_absolute_error')
cv_scores_mae = -cv_scores_mae 
print("Cross-validation MAE scores for Random Forest Regressor:", cv_scores_mae)
print("Mean MAE score:", cv_scores_mae.mean())
Cross-validation R2 scores for Random Forest Regressor: [0.85507448 0.80738194 0.81554827 0.7633761  0.77008654]
Mean R2 score: 0.8022934659816918
Cross-validation MAE scores for Random Forest Regressor: [403.70004968 564.66271815 468.18076216 559.51067536 584.78026227]
Mean MAE score: 516.1668935224409
In [142]:
# Visualization of predicted vs actual costs after using Random Forest Regressor
fig_cost = px.scatter(x=y_test, y=y_pred_reg, labels={'x': 'Actual Cost', 'y': 'Predicted Cost'}, title='Actual vs Predicted Cost')
fig_cost.add_shape(
    type="line",
    x0=y_test.min(),
    y0=y_test.min(),
    x1=y_test.max(),
    y1=y_test.max(),
    line=dict(color="Red"),
    )
fig_cost.update_layout(paper_bgcolor="white")
fig_cost.show()
In [143]:
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

init_notebook_mode(connected=True)
fig = go.Figure(data=[go.Scatter(x=[1, 2, 3], y=[4, 5, 6])])
iplot(fig)
In [ ]: